library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.5
## ✔ dplyr 1.1.3 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.0 ✔ feasts 0.4.1
## ✔ lubridate 1.9.2 ✔ fable 0.4.1
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'tsibbledata' was built under R version 4.3.3
## Warning: package 'feasts' was built under R version 4.3.3
## Warning: package 'fabletools' was built under R version 4.3.3
## Warning: package 'fable' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.3.3
library(fable)
library(fabletools)
library(tsibble)
library(dplyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ purrr 1.0.2 ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(feasts)
library(lubridate)
library(fable)
library(fabletools)
library(lubridate)
library(broom)
library(glue)
rm(list = ls())
View(PBS)
A filtered dataset where Cost has variability within each group.
PBS_filtered <- PBS %>%
group_by(Concession, Type, ATC1, ATC2) %>%
# Keeps only groups where the variance of Cost is greater than zero (i.e., removes groups where Cost is constant or missing)
filter(var(Cost, na.rm = TRUE) > 0) %>%
ungroup()
View(PBS_filtered)
Extract time series features from the Cost column of PBS_filtered using the feasts package and removes missing values.
PBS_feat has one row per group (Concession, Type, ATC1, ATC2).
PBS_feat <- PBS_filtered %>%
features(Cost, feature_set(pkgs = "feasts")) %>%
na.omit()
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
View(PBS_feat)
Filter PBS_feat to remove rows containing non-finite numeric values (i.e., Inf, -Inf, or NaN).
PBS_feat <- PBS_feat %>%
filter_if(is.numeric, all_vars(is.finite(.)))
Perform Principal Component Analysis (PCA) on the PBS_feat dataset while retaining metadata for further analysis.
PBS_prcomp <- PBS_feat %>%
# Remove categorical/grouping columns to retain only numerical features for PCA.
select(-Concession, -Type, -ATC1, -ATC2) %>%
# Performs PCA on the selected numeric features, standardizing them (scale = TRUE ensures mean 0 and variance 1).
prcomp(scale = TRUE) %>%
# Adds the PCA results (principal component scores) back to the original PBS_feat dataset.
augment(PBS_feat)
View(PBS_prcomp)
colnames(PBS_prcomp)
## [1] ".rownames" "Concession" "Type"
## [4] "ATC1" "ATC2" "trend_strength"
## [7] "seasonal_strength_year" "seasonal_peak_year" "seasonal_trough_year"
## [10] "spikiness" "linearity" "curvature"
## [13] "stl_e_acf1" "stl_e_acf10" "acf1"
## [16] "acf10" "diff1_acf1" "diff1_acf10"
## [19] "diff2_acf1" "diff2_acf10" "season_acf1"
## [22] "pacf5" "diff1_pacf5" "diff2_pacf5"
## [25] "season_pacf" "zero_run_mean" "nonzero_squared_cv"
## [28] "zero_start_prop" "zero_end_prop" "lambda_guerrero"
## [31] "kpss_stat" "kpss_pvalue" "pp_stat"
## [34] "pp_pvalue" "ndiffs" "nsdiffs"
## [37] "bp_stat" "bp_pvalue" "lb_stat"
## [40] "lb_pvalue" "var_tiled_var" "var_tiled_mean"
## [43] "shift_level_max" "shift_level_index" "shift_var_max"
## [46] "shift_var_index" "shift_kl_max" "shift_kl_index"
## [49] "spectral_entropy" "n_crossing_points" "longest_flat_spot"
## [52] "coef_hurst" "stat_arch_lm" ".fittedPC1"
## [55] ".fittedPC2" ".fittedPC3" ".fittedPC4"
## [58] ".fittedPC5" ".fittedPC6" ".fittedPC7"
## [61] ".fittedPC8" ".fittedPC9" ".fittedPC10"
## [64] ".fittedPC11" ".fittedPC12" ".fittedPC13"
## [67] ".fittedPC14" ".fittedPC15" ".fittedPC16"
## [70] ".fittedPC17" ".fittedPC18" ".fittedPC19"
## [73] ".fittedPC20" ".fittedPC21" ".fittedPC22"
## [76] ".fittedPC23" ".fittedPC24" ".fittedPC25"
## [79] ".fittedPC26" ".fittedPC27" ".fittedPC28"
## [82] ".fittedPC29" ".fittedPC30" ".fittedPC31"
## [85] ".fittedPC32" ".fittedPC33" ".fittedPC34"
## [88] ".fittedPC35" ".fittedPC36" ".fittedPC37"
## [91] ".fittedPC38" ".fittedPC39" ".fittedPC40"
## [94] ".fittedPC41" ".fittedPC42" ".fittedPC43"
## [97] ".fittedPC44" ".fittedPC45" ".fittedPC46"
## [100] ".fittedPC47" ".fittedPC48"
An interactive PCA scatter plot where hovering over points reveals their corresponding series labels.
p1 <- PBS_prcomp %>%
# Combines Concession, Type, ATC1, and ATC2 into a single identifier (serie), separated by "-", while keeping original columns.
unite("serie", Concession:ATC2, sep = "-", remove = FALSE) %>%
# Plots the first two principal components (.fittedPC1 and .fittedPC2), using serie as labels.
ggplot(aes(x = .fittedPC1, y = .fittedPC2, label = serie)) +
geom_point() +
labs(title = "PCA of PBS Series with PC1 and PC2")
# Converts the ggplot2 plot into an interactive plotly visualization.
plotly::ggplotly(p1)
summary(PBS_prcomp$.fittedPC1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.329 -3.507 1.205 0.000 2.963 9.361
A faceted time series plot highlighting the outlying series in PCA space.
# Select rows from PBS_prcomp where the first principal component (.fittedPC1) is greater than 6, identifying potential outliers.
outliers <- PBS_prcomp %>%
filter(.fittedPC1 > 6)
print(outliers)
## # A tibble: 4 × 101
## .rownames Concession Type ATC1 ATC2 trend_strength seasonal_strength_year
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 134 Concessional Safe… J J06 0.0917 0.159
## 2 190 General Co-p… C C05 0.105 0.245
## 3 243 General Co-p… S S02 0.140 0.219
## 4 244 General Co-p… S S03 0.162 0.257
## # ℹ 94 more variables: seasonal_peak_year <dbl>, seasonal_trough_year <dbl>,
## # spikiness <dbl>, linearity <dbl>, curvature <dbl>, stl_e_acf1 <dbl>,
## # stl_e_acf10 <dbl>, acf1 <dbl>, acf10 <dbl>, diff1_acf1 <dbl>,
## # diff1_acf10 <dbl>, diff2_acf1 <dbl>, diff2_acf10 <dbl>, season_acf1 <dbl>,
## # pacf5 <dbl>, diff1_pacf5 <dbl>, diff2_pacf5 <dbl>, season_pacf <dbl>,
## # zero_run_mean <dbl>, nonzero_squared_cv <dbl>, zero_start_prop <dbl>,
## # zero_end_prop <dbl>, lambda_guerrero <dbl>, kpss_stat <dbl>, …
PBS %>%
# Filter PBS to keep only the series corresponding to outliers.
semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
# Plot the Cost time series of these outliers, faceted by the grouping variables.
autoplot(Cost) +
facet_grid(vars(Concession, Type, ATC1, ATC2)) +
labs(title = "Time Series of Outliers Identified in PCA Space")
Plot for “Concessional - Safety Net - J - J06”
PBS %>%
semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
filter(Concession == "Concessional", Type == "Safety net", ATC1 == "J", ATC2 == "J06") %>%
autoplot(Cost) +
labs(title = "Time Series of Outlier: Concessional - Safety Net - J - J06")
Plot for “General - Co-payments - C - C05”
PBS %>%
semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
filter(Concession == "General", Type == "Co-payments", ATC1 == "C", ATC2 == "C05") %>%
autoplot(Cost) +
labs(title = "Time Series of Outlier: General - Co-payments - C - C05")
Plot for “General - Co-payments - S - S02”
PBS %>%
semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
filter(Concession == "General", Type == "Co-payments", ATC1 == "S", ATC2 == "S02") %>%
autoplot(Cost) +
labs(title = "Time Series of Outlier: General - Co-payments - S - S02")
Plot for “General - Co-payments - S - S03”
PBS %>%
semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
filter(Concession == "General", Type == "Co-payments", ATC1 == "S", ATC2 == "S03") %>%
autoplot(Cost) +
labs(title = "Time Series of Outlier: General - Co-payments - S - S03")
The unusual characteristics of the identified outlier series are: